To install the Jupyter kernel used by NERSC metatlas users, copy the following text to $HOME/.ipython/kernels/mass_spec_cori/kernel.json:
{
"argv": [
"/global/common/software/m2650/python-cori/bin/python",
"-m",
"IPython.kernel",
"-f",
"{connection_file}"
],
"env": {
"PATH": "/global/common/software/m2650/python-cori/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"
},
"display_name": "mass_spec_cori",
"language": "python"
}
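If you prefer to create the kernel spec from within Python, a minimal sketch is below; it simply writes the JSON shown above to the expected path (creating the file by hand works just as well):
import json, os

# Write the kernel spec shown above to $HOME/.ipython/kernels/mass_spec_cori/kernel.json
kernel_dir = os.path.expanduser('~/.ipython/kernels/mass_spec_cori')
kernel_spec = {
    "argv": ["/global/common/software/m2650/python-cori/bin/python",
             "-m", "IPython.kernel", "-f", "{connection_file}"],
    "env": {"PATH": "/global/common/software/m2650/python-cori/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"},
    "display_name": "mass_spec_cori",
    "language": "python"
}
if not os.path.exists(kernel_dir):
    os.makedirs(kernel_dir)
with open(os.path.join(kernel_dir, 'kernel.json'), 'w') as f:
    json.dump(kernel_spec, f, indent=1)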
In [1]:
from IPython.core.display import Markdown, display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib notebook
%matplotlib inline
%env HDF5_USE_FILE_LOCKING=FALSE
import sys, os
#### add a path to your private code if not using production code ####
#print ('point path to metatlas repo')
sys.path.insert(0,"/global/homes/v/vrsingan/repos/metatlas/") #where your private code is
######################################################################
from metatlas.helpers import dill2plots as dp
from metatlas.helpers import metatlas_get_data_helper_fun as ma_data
from metatlas.helpers import chromatograms_mp_plots as cp
from metatlas.helpers import chromplotplus as cpp
import metatlas.metatlas_objects as metob
from metatlas.helpers import mzmine_helpers as mzm
import qgrid
from ipywidgets import interact, interactive, fixed, IntProgress
import ipywidgets as widgets
from IPython.display import display, clear_output
import time
import dill
import numpy as np
import multiprocessing as mp
import pandas as pd
import glob
import re
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)
def printmd(string):
    display(Markdown(string))
In [2]:
project_directory='/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/PROJECTDIRECTORY/' # <- edit this line; do not copy the path directly from NERSC (e.g., omit the u1 or u2 directories)
output_subfolder='HILIC_POS_20190830/' # <- edit this as 'chromatography_polarity_yyyymmdd/'
output_dir = os.path.join(project_directory,output_subfolder)
output_data_qc = os.path.join(output_dir,'data_QC')
if not os.path.exists(project_directory):
    os.makedirs(project_directory)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
if not os.path.exists(output_data_qc):
    os.makedirs(output_data_qc)
In [89]:
dp = reload(dp) # reload picks up local edits to dill2plots (use importlib.reload on Python 3)
#groups = dp.select_groups_for_analysis(name = '%20190308_KBL_IG-SS_BETO_Algae_Mono26B_HF_LipV7_47560%',
groups = dp.select_groups_for_analysis(name = '%20181126_CondTann_SoilEnrich_ 20180709-Aq_HIL_FPS_MS1_QC-PrimMet-SOPv3%',
most_recent = True,
remove_empty = True,
include_list = ['QC'], exclude_list = []) #['QC','Blank']
In [34]:
for g in groups:
    for f in g.items:
        print(f.name)
Available templates in Database:
MSMLS_HILICz150mm_Annotation20190824_Template_EMA_Unlabeled_Positive
MSMLS_HILICz150mm_Annotation20190824_Template_EMA_Unlabeled_Negative
MSMLS_HILICz150mm_Annotation20190824_Template_QCv3_Unlabeled_Positive
MSMLS_HILICz150mm_Annotation20190824_Template_QCv3_Unlabeled_Negative
MSMLS_HILICz150mm_Annotation20190824_Template_ISv5_Unlabeled_Positive
MSMLS_HILICz150mm_Annotation20190824_Template_ISv5_Unlabeled_Negative
MSMLS_HILICz150mm_Annotation20190824_Template_ISv5_13C15N_Positive
MSMLS_HILICz150mm_Annotation20190824_Template_ISv5_13C15N_Negative
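The list above comes from the atlas database. A minimal sketch for regenerating it with metob.retrieve (imported in the first cell) is shown in the next cell; the '%' wildcard follows the name patterns used elsewhere in this notebook, and username='*' to search across all owners is an assumption.
In [ ]:
# Sketch: list the 20190824 HILIC template atlases stored in the database.
# The '%' wildcard mirrors the retrieve calls in this notebook; username='*' (all owners) is assumed.
template_atlases = metob.retrieve('Atlas',
                                  name='MSMLS_HILICz150mm_Annotation20190824_Template%',
                                  username='*')
for template_name in sorted({a.name for a in template_atlases}):
    print(template_name)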
In [ ]:
#Atlas File Name
LCS = 'MSMLS' # Library Compound Set
CTY = 'HILICz150mm' # Chromatography
LR = 'Annotation20190824' # Library Run
RTS = 'Template' # RT space
CPD = 'QCv3' # Set of Compounds
LAB = 'Unlabeled' # Isolabeling
POL = 'Positive' # Polarity
QC_template_filename = "_".join((LCS,CTY,LR,RTS,CPD,LAB,POL))
atlases = metob.retrieve('Atlas',name=QC_template_filename,
username='vrsingan')
names = []
for i,a in enumerate(atlases):
    print(i, a.name, pd.to_datetime(a.last_modified, unit='s'), len(a.compound_identifications))
In [6]:
# #Alternatively use this block to create QC atlas from spreadsheet
# import datetime
#dp = reload(dp)
# LCS = 'MSMLS' # Library Compound Set
# CTY = 'HILICz150mm' # Chromatography
# LR = 'Annotation20190824' # Library Run
# RTS = 'Template' # RT space
# CPD = 'QCv3' # Set of Compounds
# LAB = 'Unlabeled' # Isolabeling
# POL = 'Positive' # Polarity
# DT = datetime.datetime.strftime(datetime.datetime.now(),'%Y%m%d')
# QC_template_filename = "_".join((LCS,CTY,LR,RTS,CPD,LAB,POL,DT))
#myAtlas = dp.make_atlas_from_spreadsheet('/global/project/projectdirs/metatlas/projects/1_TemplateAtlases/TemplateAtlas_HILICz150mm_Annotation20190824_QCv3_Unlabeled_Positive.csv',
# QC_template_filename,
# filetype='csv',
# sheetname='',
# polarity = 'positive',
# store=True,
# mz_tolerance = 20)
#atlases = dp.get_metatlas_atlas(name=QC_template_filename,do_print = True,most_recent=True)
In [7]:
myAtlas = atlases[-1]
atlas_df = ma_data.make_atlas_df(myAtlas)
atlas_df['label'] = [cid.name for cid in myAtlas.compound_identifications]
print(myAtlas.name)
print(myAtlas.username)
In [8]:
# rt_allowance = 0.5
# atlas_df['rt_min'] = atlas_df['rt_peak'].apply(lambda rt: rt-rt_allowance)
# atlas_df['rt_max'] = atlas_df['rt_peak'].apply(lambda rt: rt+rt_allowance)
In [90]:
all_files = []
for my_group in groups:
    for my_file in my_group.items:
        all_files.append((my_file, my_group, atlas_df, myAtlas))
pool = mp.Pool(processes=min(4, len(all_files)))
t0 = time.time()
metatlas_dataset = pool.map(ma_data.get_data_for_atlas_df_and_file, all_files)
pool.close()
pool.terminate()
# If your code crashes here, make sure to terminate any processes left open.
print(time.time() - t0)
In [91]:
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.metrics import mean_absolute_error as mae
ransac = RANSACRegressor(random_state=42)
qc_actual_df = dp.make_output_dataframe(input_dataset = metatlas_dataset, fieldname='rt_peak', use_labels=True) # Peak height filter??
maes, coefs, intercepts = [],[],[]
actual_rts, pred_rts = [],[]
for i in range(qc_actual_df.shape[1]):
    current_actual_df = qc_actual_df.loc[:, qc_actual_df.columns[i]]
    good_qc_compounds = np.where(~np.isnan(current_actual_df))  # compounds detected in this QC file
    current_actual_df = current_actual_df.iloc[good_qc_compounds]
    current_pred_df = atlas_df.iloc[good_qc_compounds][['rt_peak']]
    actual_rts.append(current_actual_df.values.tolist())
    pred_rts.append(current_pred_df.values.tolist())
    rt_model = ransac.fit(current_pred_df, current_actual_df)
    coefs.append(rt_model.estimator_.coef_[0])
    intercepts.append(rt_model.estimator_.intercept_)
    maes.append(mae(rt_model.estimator_.coef_*current_pred_df +
                    rt_model.estimator_.intercept_, current_actual_df))
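Before choosing a correction, it can help to tabulate the per-file fits computed above. A small optional sketch, using only the coefs, intercepts, and maes lists already built in this cell, is:
In [ ]:
# Optional: summarize the per-file RANSAC fits; a lower MAE indicates a better
# linear mapping from template RTs to the RTs observed in that QC file.
fit_summary = pd.DataFrame({'file': [c[1] for c in qc_actual_df.columns],
                            'coef': coefs,
                            'intercept': intercepts,
                            'MAE': maes})
fit_summary.sort_values('MAE')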
In [92]:
from sklearn.metrics import mean_absolute_error as mae
fig_legend = "FileIndex FileName"
for i in range(qc_actual_df.shape[1]):
    if maes[i] == min(maes):
        QCFileIndex = i
        fname = qc_actual_df.columns[i][1].replace("_", r"\_")
        fig_legend = fig_legend + "\n" + r"$\bf{" + str(i) + "}$" + "\t" + r"$\bf{" + str(fname) + "}$"
    else:
        fig_legend = fig_legend + "\n" + str(i) + " " + qc_actual_df.columns[i][1]
plt.rc('font', size=10)
plt.rc('axes', labelsize=8)
plt.rc('xtick', labelsize=8)
plt.rc('ytick', labelsize=8)
fig, axes = plt.subplots(2, sharex=True)
color = 'tab:red'
axes[0].set_ylabel('coef', color=color)
axes[0].plot(coefs, color=color)
axes[0].tick_params(axis='y', labelcolor=color)
axes[0].axhline(y=np.median(coefs), color='orangered')  # median slope across files
ax2 = axes[0].twinx()
color = 'tab:blue'
ax2.set_ylabel('intercept', color=color)
ax2.plot(intercepts, color=color)
ax2.tick_params(axis='y', labelcolor=color)
ax2.axhline(y=np.median(intercepts), color='lightblue')  # median intercept across files
ax2.set_title('Coefficients and Intercepts of Regression')
axes[1].plot(maes)
axes[1].set_xticks(np.arange(len(maes)))
axes[1].set_ylabel('MAE')
axes[1].set_xlabel('Files')
axes[1].set_title('Mean Absolute Error for all files')
plt.text(0,-0.04*qc_actual_df.shape[1], fig_legend, transform=plt.gcf().transFigure)
fig.tight_layout(pad=2.5)
plt.savefig(os.path.join(output_data_qc,'MAE_allFiles.pdf'), bbox_inches="tight")
In [93]:
#User can change to use particular qc file
from __future__ import division  # future imports must precede other statements in the cell
import itertools
import math
from matplotlib import gridspec
x = list(itertools.chain(*pred_rts))
y = list(itertools.chain(*actual_rts))
coef = np.median(coefs)
intercept = np.median(intercepts)
rows = int(math.ceil((qc_actual_df.shape[1]+1)/5))
cols = 5
fig = plt.figure(constrained_layout=True)
gs = gridspec.GridSpec(rows, cols)
plt.rc('font', size=6)
plt.rc('axes', labelsize=6)
plt.rc('xtick', labelsize=3)
plt.rc('ytick', labelsize=3)
ax = fig.add_subplot(gs[0])
ax.scatter(x,y, s=3)
ax.plot(np.linspace(0, max(x), 100), coef*np.linspace(0, max(x), 100)+intercept, linewidth=0.5)
ax.set_title("Median of Files")
ax.set_xlabel('predicted RTs')
ax.set_ylabel('actual RTs')
for i in range(qc_actual_df.shape[1]):
    # Plot each file's own fit; coef/intercept above stay at the across-file medians
    # (or override them from a single QC file using the commented-out cell below).
    file_x = list(itertools.chain(*pred_rts[i]))
    file_y = actual_rts[i]
    ax = fig.add_subplot(gs[i+1])
    ax.scatter(file_x, file_y, s=3)
    ax.plot(np.linspace(0, max(file_x), 100), coefs[i]*np.linspace(0, max(file_x), 100)+intercepts[i], linewidth=0.5)
    ax.set_title("File: "+str(i))
    ax.set_xlabel('predicted RTs')
    ax.set_ylabel('actual RTs')
fig_legend = "FileIndex FileName"
for i in range(qc_actual_df.shape[1]):
    fig_legend = fig_legend + "\n" + str(i) + " " + qc_actual_df.columns[i][1]
fig.tight_layout(pad=0.5)
plt.text(0,-0.03*qc_actual_df.shape[1], fig_legend, transform=plt.gcf().transFigure)
plt.savefig(os.path.join(output_data_qc, 'Actual_vs_Predicted_RTs.pdf'), bbox_inches="tight")
In [14]:
# #QCFileIndex = 1
# coef = coefs[QCFileIndex]
# intercept = intercepts[QCFileIndex]
# x = list(itertools.chain(*pred_rts[QCFileIndex]))
# y = actual_rts[QCFileIndex]
# plt.scatter(x, y)
# plt.plot(np.linspace(0, max(x),100), coef*np.linspace(0,max(x),100)+intercept)
# plt.xlabel('predicted RTs')
# plt.ylabel('actual RTs')
In [15]:
# Save model
with open(os.path.join(output_data_qc, 'rt_model.txt'), 'w') as f:
    f.write('coef = {}\nintercept = {}\nqc_actual_rts = {}\nqc_predicted_rts = {}'.format(coef,
                                                                                          intercept,
                                                                                          ', '.join([g.name for g in groups]),
                                                                                          myAtlas.name))
    f.write('\n'+repr(rt_model.set_params()))
In [94]:
import datetime
#Atlas File Name
LCS = 'MSMLS' # Library Compound Set
CTY = 'HILICz150mm' # Chromatography
LR = 'Annotation20190824' # Library Run
RTS = 'Predicted' # RT space
CPD = 'QCv3' # Set of Compounds
LAB = 'Unlabeled' # Isolabeling
POL = 'Positive' # Polarity
FT = '' # Free Text
DT = datetime.datetime.strftime(datetime.datetime.now(),'%Y%m%d')
if FT != '':
    QC_predicted_filename = "_".join((LCS,CTY,LR,RTS,CPD,LAB,POL,FT,DT))+".csv"
else:
    QC_predicted_filename = "_".join((LCS,CTY,LR,RTS,CPD,LAB,POL,DT))+".csv"
atlas_df = ma_data.make_atlas_df(myAtlas)
QC_atlas_df = atlas_df.copy()
QC_atlas_df['rt_peak'] = QC_atlas_df['rt_peak'].apply(lambda rt: coef*rt+intercept)
QC_atlas_df['rt_min'] = QC_atlas_df['rt_peak'].apply(lambda rt: rt-.5)
QC_atlas_df['rt_max'] = QC_atlas_df['rt_peak'].apply(lambda rt: rt+.5)
QC_atlas_df.to_csv(os.path.join(output_data_qc, QC_predicted_filename), index=False)
all_files = []
for my_group in groups:
    for my_file in my_group.items:
        all_files.append((my_file, my_group, QC_atlas_df, myAtlas))
pool = mp.Pool(processes=min(4, len(all_files)))
t0 = time.time()
metatlas_dataset = pool.map(ma_data.get_data_for_atlas_df_and_file, all_files)
pool.close()
pool.terminate()
# If your code crashes here, make sure to terminate any processes left open.
print(time.time() - t0)
group = 'index' # 'page' or 'index' or None
save = True
share_y = True
dp.make_chromatograms(input_dataset=metatlas_dataset, group=group, share_y=share_y, save=save, output_loc=output_data_qc)
In [ ]:
dp.make_atlas_from_spreadsheet(QC_atlas_df,
QC_predicted_filename,
filetype='dataframe',
sheetname='',
polarity = 'positive',
store=True,
mz_tolerance = 20)
In [ ]:
import datetime
#Atlas File Name
LCS = 'MSMLS' # Library Compound Set
CTY = 'HILICz150mm' # Chromatography
LR = 'Annotation20190824' # Library Run
RTS = 'Template' # RT space
CPD = 'EMA' # Set of Compounds
LAB = 'Unlabeled' # Isolabeling
POL = 'Positive' # Polarity
FT = '' # Free Text
DT = datetime.datetime.strftime(datetime.datetime.now(),'%Y%m%d')
EMA_template_filename = "_".join((LCS,CTY,LR,RTS,CPD,LAB,POL))
RTS = 'Predicted' # RT space
if FT != '':
    EMA_predicted_filename = "_".join((LCS,CTY,LR,RTS,CPD,LAB,POL,FT,DT))+".csv"
else:
    EMA_predicted_filename = "_".join((LCS,CTY,LR,RTS,CPD,LAB,POL,DT))+".csv"
atlases = metob.retrieve('Atlas',name=EMA_template_filename,
username='vrsingan')
myAtlas = atlases[-1]
EMA_atlas_df = ma_data.make_atlas_df(myAtlas)
EMA_atlas_df['label'] = [cid.name for cid in myAtlas.compound_identifications]
EMA_atlas_df['rt_peak'] = EMA_atlas_df['rt_peak'].apply(lambda rt: coef*rt+intercept)
EMA_atlas_df['rt_min'] = EMA_atlas_df['rt_peak'].apply(lambda rt: rt-.5)
EMA_atlas_df['rt_max'] = EMA_atlas_df['rt_peak'].apply(lambda rt: rt+.5)
EMA_atlas_df.to_csv(os.path.join(output_data_qc,EMA_predicted_filename), index=False)
# Optionally save in database
#dp.make_atlas_from_spreadsheet(EMA_atlas_df,
# EMA_predicted_filename,
# filetype='dataframe',
# sheetname='',
# polarity = 'positive',
# store=True,
# mz_tolerance = 20)